/**************************************************************
This do file creates the analysis dataset used in Abarcar and Theoharides 2021;
It merges multiple datasets from the various sources used in the paper;
**************************************************************/

capture log close
capture program drop _all
capture macro drop _all
drop _all
set more off

*Update path names
global ched_data 
global cfo_data 
global pop_data 
global psced_data 
global census_data 
global here 
global quality "/Users/ctheoharides/Box/Philippine Nurse Migration Research/Quality data/Stata/"

**************************
*MERGE NSCB CODES TO CONTROLS DATASET IN ORDER TO MERGE WITH CFO AND CHED DATA LATER;

use "${census_data}census_controls_90_v2.dta", clear
drop _merge
destring prv, replace
merge 1:1 prv using "${census_data}nscb_codes_prv_names.dta"

replace province="MOUNTAIN PROVINCE" if prv==44
replace province="THIRD DISTRICT" if prv==39

drop if _merge==2
drop _merge

tempfile census_controls
save "`census_controls'"


***CLEANING THE CHED DATASET****************************************************
use "${ched_data}finalched_v2.dta", clear

gen year=substr(acadyear,1,4)
destring year, replace
drop acadyear

drop if province==""															

drop if pscedgroup=="."															
drop if pscedgroup==""															

*Reclassify pscedgroups so that they are consistent across year
replace pscedgroup="89" if pscedgroup=="46" | pscedgroup=="90" | pscedgroup=="47"	
replace pscedgroup="0" if pscedgroup=="00"											

*label pscedgroups
merge m:1 pscedgroup using "${psced_data}pscedmatch.dta", replace update		
drop if _m==2
drop _m

*This identifies which programs are nursing programs. 
gen nurse=1 if six_digitprogdis=="501200" & pscedgroup=="50"
replace nurse=1 if six_digitprogdis=="34" & pscedgroup=="50"
replace nurse=1 if six_digitprogdis=="500202" & pscedgroup=="50"
replace nurse=1 if six_digitprogdis=="503001" & pscedgroup=="50"
replace nurse=1 if six_digitprogdis=="509903" & pscedgroup=="50"
replace nurse=0 if nurse==.

replace pscedgroup="100" if nurse==1
replace disciplinegroup="Nursing" if nurse==1

collapse (sum) emftotal eftotal emtotal gmftotal gftotal gmtotal, by(province nurse year)

*Filling in observations for provinces without nurse enrollment 
reshape wide emftotal emtotal eftotal gmftotal gmtotal gftotal, i(province year) j(nurse)
forvalues i=0(1)1 {
	replace emftotal`i'=0 if emftotal`i'==.
	replace eftotal`i'=0 if eftotal`i'==.
	replace emtotal`i'=0 if emtotal`i'==.
	replace gmftotal`i'=0 if gmftotal`i'==.
	replace gftotal`i'=0 if gftotal`i'==.
	replace gmtotal`i'=0 if gmtotal`i'==.
	}
reshape long emftotal emtotal eftotal gmftotal gmtotal gftotal, i(province year) j(nurse)

reshape wide emftotal emtotal eftotal gmftotal gmtotal gftotal, i(province nurse) j(year)
forvalues i=2001(1)2013 {
	replace emftotal`i'=0 if emftotal`i'==.
	replace eftotal`i'=0 if eftotal`i'==.
	replace emtotal`i'=0 if emtotal`i'==.
	replace gmftotal`i'=0 if gmftotal`i'==.
	replace gftotal`i'=0 if gftotal`i'==.
	replace gmtotal`i'=0 if gmtotal`i'==.
	}
reshape long emftotal emtotal eftotal gmftotal gmtotal gftotal, i(province nurse) j(year)

tostring year, gen(temp)
gen matcher=province+temp
drop temp

reshape wide emftotal emtotal eftotal gmftotal gmtotal gftotal, i(province year) j(nurse)

save "${here}chedmerge.dta", replace


***CLEANING THE CFO DATASET*****************************************************

use "${cfo_data}finalCFO.dta", clear

keep if age>=18 & age<=60 														//Keep only working age population

*Classifying which programs are nursing programs 
*Course was Nursing or occupation is a professional nurse
gen nurse_course=1 if course=="NURSING (SUPPLEMENTAL), B.S."
replace nurse_course=1 if course=="NURSING AIDE, COURSE FOR"
replace nurse_course=1 if course=="NURSING EDUCATION, B.S."
replace nurse_course=1 if course=="NURSING, B.S."
replace nurse_course=1 if course=="NURSING, GRADUATE IN"
replace nurse_course=1 if course=="NURSING, M.S."
replace nurse_course=1 if course=="NURSING, MASTER OF"
replace nurse_course=1 if course=="NURSING, PROGRAMS IN"
replace nurse_course=1 if course=="PHILOSOPHY IN NURSING, DOCTOR OF"
replace nurse_course=0 if nurse_course==.

gen nurse_prof=1 if profession=="PROFESSIONAL NURSE"
replace nurse_prof=0 if nurse_prof==.

gen nurse=1 if nurse_prof==1 | nurse_course==1
replace nurse=0 if nurse==.

gen nonnurse=1 if nurse==0
replace nonnurse=0 if nurse==1

*Classifying nurses who went to the USA
gen nurseUSA=1 if nurse==1 & countryofdestination=="UNITED STATES OF AMERICA"
replace nurseUSA=0 if nurseUSA==.

gen nonnurseUSA=1 if nurse==0 & countryofdestination=="UNITED STATES OF AMERICA"
replace nonnurseUSA=0 if nonnurseUSA==.

gen nurseCAN=1 if nurse==1&countryofdestination=="CANADA"
replace nurseCAN=0 if nurseCAN==.

collapse (sum) nurse nonnurse nurseUSA nonnurseUSA nurseCAN, by(province year) 
drop if province=="NO RESPONSE"

*Filling in observations for provinces without nurse enrollment
reshape wide nurse nonnurse nurseUSA nonnurseUSA nurseCAN, i(province) j(year)
forvalues i=1988(1)2014 {
	replace nurse`i'=0 if nurse`i'==.
	replace nonnurse`i'=0 if nonnurse`i'==.
	}
reshape long nurse nonnurse nurseUSA nonnurseUSA nurseCAN, i(province) j(year)

tostring year, gen(temp)
gen matcher=province+temp
drop temp

save "${here}cfomerge.dta", replace


***MERGING DATASETS*************************************************************

merge m:1 matcher using "${here}chedmerge.dta" 									
drop _m

merge m:1 province year using "${pop_data}totpop.dta"
drop if _m==2																	
drop _m

merge m:1 province year using "${pop_data}collegepop.dta"
drop if _m==2
drop _m

*Reassign to original provinces before provincial divisons;
replace province="ZAMBOANGA DEL SUR" if province=="ZAMBOANGA SIBUGAY"											
replace province="SURIGAO DEL NORTE" if province=="DINAGAT ISLANDS"

replace province="SOUTH COTABATO" if province=="SARANGANI"						
replace province="DAVAO DEL NORTE" if province=="COMPOSTELA VALLEY"

replace province="KALINGA" if province=="APAYAO"						

*Merging all districts of Manila into one
replace province="THIRD DISTRICT" if province=="FIRST DISTRICT"					
replace province="THIRD DISTRICT" if province=="SECOND DISTRICT"				
replace province="THIRD DISTRICT" if province=="FOURTH DISTRICT"

drop matcher

*collapsing because of merging of province data for Metro Manila
collapse (sum) nurse nonnurse nursebroad nurseUSA nonnurseUSA nurseCAN nurseUSA_petition totpop* emftotal0 eftotal0 emtotal0 	///
gmftotal0 gftotal0 gmtotal0 emftotal1 eftotal1 emtotal1 gmftotal1 gftotal1 		///
gmtotal1 collegepop*, by(province year)

*CHED data are not included in the raw dataset for some variables in certain years
*Replace as missing for those variable-years
forvalues i=0(1)1 {
	replace eftotal`i'=. if year==1997
	replace emtotal`i'=. if year==1997
	replace gmftotal`i'=. if (year>=1990 & year<=1993) | year==1997
	replace gftotal`i'=. if year>=1990 & year<=1997
	replace gmtotal`i'=. if year>=1990 & year<=1997
	}
	

*MERGE IN CENSUS CONTROLS
merge n:1 province using "`census_controls'"
drop _merge

*MERGE IN NURSE PROGRAM DATA;
merge 1:1 province year using "${ched_data}nurse_prog.dta"
drop _merge

erase "${here}cfomerge.dta"
erase "${here}chedmerge.dta"

***SPECIFYING VARIABLES FOR ANALYSIS********************************************
*DROP SULU AND TAWI-TAWI FROM ANALYSIS (DATA ARE IDENTICAL ACROSS ALL YEARS);
drop if province=="TAWI-TAWI"			
drop if province=="SULU"

local baseyear1=1990
local baseyear2=1991

gen migrate=nurse/totpop*100
gen migrateUSA=nurseUSA/totpop*100
* Create Nurse migrants per hundred thousand;
gen migrateUSA_hth=nurseUSA/(totpop/100000)
gen migrate_nonnurse=nonnurse/totpop*100
gen migrate_nonnurse_hth=nonnurse/(totpop/100000)

gen migrateCAN=nurseCAN/totpop*100
gen migrateUSA_petition=nurseUSA_petition/totpop*100

gen migrant=nurse+nonnurse
gen migrate_total=(migrant/totpop)*100
gen migrate_total_hth=(migrant/(totpop/100000))

gen temp`baseyear1'=migrateUSA if year==`baseyear1'
gen temp`baseyear2'=migrateUSA if year==`baseyear2'
gen temp`baseyear1'_p=migrateUSA_petition if year==`baseyear1'

bys province: egen premigrate`baseyear1'=max(temp`baseyear1')
bys province: egen premigrate`baseyear2'=max(temp`baseyear2')
bys province: egen premigrate`baseyear1'_petition=max(temp`baseyear1')

drop temp`baseyear1' temp`baseyear2' temp`baseyear1'_p

gen post2000=1 if year>=2000
replace post2000=0 if post2000==. 

gen post2007=1 if year>=2007
replace post2007=0 if post2007==.

*Create post variables for 4 years in the future for graduation results
gen post2003=1 if year>=2003
replace post2003=0 if post2003==.

gen post2010=1 if year>=2010
replace post2010=0 if post2010==.

sum premigrate`baseyear1', detail
gen high1=1 if premigrate`baseyear1'>=`r(p50)'
replace high1=0 if high1==.

sum premigrate`baseyear2', detail
gen high2=1 if premigrate`baseyear2'>=`r(p50)'
replace high2=0 if high2==.

sum premigrate`baseyear1'_petition, detail
gen high1_petition=1 if premigrate`baseyear1'_petition>=`r(p50)'
replace high1_petition=0 if high1_petition==.

gen inter2000_`baseyear1'=post2000*premigrate`baseyear1'
gen inter2000_`baseyear2'=post2000*premigrate`baseyear2'
gen inter2007_`baseyear1'=post2007*premigrate`baseyear1'
gen inter2007_`baseyear2'=post2007*premigrate`baseyear2'

gen interhigh2000_`baseyear1'=post2000*high1
gen interhigh2000_`baseyear2'=post2000*high2
gen interhigh2007_`baseyear1'=post2007*high1
gen interhigh2007_`baseyear2'=post2007*high2

forvalues i=0(1)1 {
gen enroll_tot`i'=emftotal`i'/collegepop*100
gen enroll_f`i'=eftotal`i'/collegepop*100
gen enroll_m`i'=emtotal`i'/collegepop*100
gen grad_tot`i'=gmftotal`i'/collegepop*100
gen grad_f`i'=gftotal`i'/collegepop*100
gen grad_m`i'=gmtotal`i'/collegepop*100

gen enroll_ff`i'=eftotal`i'/collegepop_f*100
gen enroll_mm`i'=emtotal`i'/collegepop_m*100
}

*Generate the share of male nurses out of the total;
gen male_nurse_share=emtotal1/emftotal1

gen enroll_tot=enroll_tot0+enroll_tot1
gen enroll_f=enroll_f1+enroll_f0
gen enroll_m=enroll_m1+enroll_m0

gen grad_tot=grad_tot1+grad_tot0
gen grad_f=grad_f1+grad_f0
gen grad_m=grad_m1+grad_m0

encode province, gen(provcode)

tab provcode, gen(ttrend)

*Create province specific time trends;
sum provcode
forvalues i=1(1)`r(max)' {
	replace ttrend`i'=ttrend`i'*year
	}

forvalues x=1991/2014 {
	gen yr`x'=0
	replace yr`x'=1 if year==`x'
	gen yr`x'_pre=yr`x'*high1
}

*Create trend variable for pooled analysis;
egen ttrend = group(year)

*Create baseline controls x year;
foreach var in emp_f_2564_ emp_m_2564_  urban_ age_ female_ _yrschl_2564_ {
		
	forvalues y=1991/2014 {
		gen `var'90_yr`y'=`var'90*yr`y'
	}
}

*Fill in region for missing years;
bys province: egen temp=max(regn)
replace regn=temp if regn==.
*Fix missing region for Mountain Province;
replace regn=14 if province=="MOUNTAIN PROVINCE";

*See data appendix: replace extreme outliers for enrollment and graduation as missing; 
		foreach l of varlist enroll_tot1 enroll_tot0 enroll_tot denroll_tot1 enroll_f1 enroll_m1 enroll_ff1 enroll_mm1 male_nurse_share enroll_f0 enroll_m0 enroll_ff0 enroll_mm0 enroll_f enroll_m {
			replace `l'=. if province=="AGUSAN DEL NORTE" & year==1994
			replace `l'=. if province=="AGUSAN DEL NORTE" & year==1996
			replace `l'=. if province=="AURORA" & year==1996
			replace `l'=. if province=="BOHOL" & year==1994
			replace `l'=. if province=="BOHOL" & year==1997
			replace `l'=. if province=="BUKIDNON" & year==1994
			replace `l'=. if province=="CAMARINES NORTE" & year==1994
			replace `l'=. if province=="CATANDUANES" & year==1996
			replace `l'=. if province=="CATANDUANES" & year==2001
			replace `l'=. if province=="CEBU" & year==1997
			replace `l'=. if province=="DAVAO DEL NORTE" & year==1996
			replace `l'=. if province=="DAVAO ORIENTAL" & year==1993
			replace `l'=. if province=="GUIMARAS" & year==2012
			replace `l'=. if province=="LEYTE" & year==1997
			replace `l'=. if province=="MARINDUQUE" & year==1997
			replace `l'=. if province=="MARINDUQUE" & year==2001				
			replace `l'=. if province=="MISAMIS OCCIDENTAL" & year==1994
			replace `l'=. if province=="MISAMIS ORIENTAL" & year==1994
			replace `l'=. if province=="NEGROS ORIENTAL" & year==1997
			replace `l'=. if province=="NUEVA VIZCAYA" & year==1995
			replace `l'=. if province=="PALAWAN" & year==1997
			replace `l'=. if province=="RIZAL" & year==1994
			replace `l'=. if province=="ROMBLON" & year==1994
			replace `l'=. if province=="ROMBLON" & year==1996
			replace `l'=. if province=="ROMBLON" & year==2007
			replace `l'=. if province=="SIQUIJOR" & year==1997
			replace `l'=. if province=="SIQUIJOR" & year==2007
			replace `l'=. if province=="SOUTH COTABATO" & year==1995
			replace `l'=. if province=="SURIGAO DEL NORTE" & year==1994
			replace `l'=. if province=="SURIGAO DEL NORTE" & year==1996
			replace `l'=. if province=="THIRD DISTRICT" & year==1994
			replace `l'=. if province=="ZAMBOANGA DEL SUR" & year==1996
			}
			
		foreach l of varlist grad_tot0 grad_tot1 grad_tot dgrad_tot1 dgrad_tot0 dgrad_tot grad_f0 grad_m0 grad_f grad_m grad_m1 grad_f1 {
			replace `l'=. if province=="AURORA" & year==2011										
			replace `l'=. if province=="BATANES" & year==1994
			replace `l'=. if province=="CAGAYAN" & year==2011
			replace `l'=. if province=="CAMARINES SUR" & year==2006
			replace `l'=. if province=="CAMARINES SUR" & year==2007
			replace `l'=. if province=="CAMIGUIN" & year==2000
			replace `l'=. if province=="CAPIZ" & year==2009
			replace `l'=. if province=="CATANDUANES" & year==2006									
			replace `l'=. if province=="GUIMARAS" & year==2002
			replace `l'=. if province=="LANAO DEL NORTE" & year==2011
			replace `l'=. if province=="OCCIDENTAL MINDORO" & year==2001
			replace `l'=. if province=="OCCIDENTAL MINDORO" & year==2003
			replace `l'=. if province=="ORIENTAL MINDORO" & year==1996
			replace `l'=. if province=="PALAWAN" & year==1996
			replace `l'=. if province=="ROMBLON" & year==2007	
			replace `l'=. if province=="SIQUIJOR" & year==2007
			replace `l'=. if province=="TARLAC" & year==2010										
			replace `l'=. if province=="TARLAC" & year==2011										
		}

foreach var of varlist grad_tot1 grad_tot0 grad_tot grad_m1 grad_f1 grad_m0 grad_f0 grad_m grad_f{
	replace `var'=. if year<1998
}

*Merge in Nurse Quality Data;
*These are created in 2_create_quality_variables.do;
merge 1:1 province year using "${quality}quality_province_year.dta", gen(_merge2)
drop if _merge2==2
drop _merge2

*MERGE IN CENSUS NURSE COUNTS;
merge 1:1 province year using "${census_data}Cleaned Data/Census_nurse_counts.dta", gen(_merge3)
drop if _merge3==2
drop _merge3

*Create baseline domestic nurses (from Census) per capita times year FE;
gen temp_nurse_cen90=nurse_cen if year==1990
bys province: egen nurse_cen90=max(temp_nurse_cen90)
gen nurse_cen_pc=nurse_cen/pop_90
gen temp_nurse_cen_pc90=nurse_cen_pc if year==1990
bys province: egen nurse_cen_pc90=max(temp_nurse_cen_pc90) 
drop temp_nurse*

save "${ched_data}analysis_sample.dta", replace




